/*
fpu_lib.h - Floating-point handling library
Copyright (C) 2024  LekKit <github.com/LekKit>

This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#ifndef LEKKIT_FPU_LIB_H
#define LEKKIT_FPU_LIB_H

#include "fpu_types.h"
#include "mem_ops.h"

/*
 * NOTE: Floating-point is actually subtly (or seriously) broken on many compilers and platforms
 *
 * Signaling NaNs are silently converted to quit NaNs on 8087 FPU (Legacy x86 FPU before SSE2):
 * - The conversion happens upon storing a 8087 register into memory, together with other transformations
 * - This is possibly the most convoluted issue to fix, and a major reason this library exists at all
 * Workaround: Introduce fpu_fp32_t/fpu_fp64_t typedefs, which are only unwrapped to native floating-point
 *   type during an actual FP operation, preventing involuntary changes to the floating-point representation.
 *   Signaling NaN checks are implemented in software.
 * USE_SOFT_FPU_WRAP is enabled on i386 and m68k, and allows proper IEEE 754 conformance.
 *
 * Signaling comparisons (<, <=) may be miscompiled into quiet comparisons:
 * - Older GCC <8.1, Clang <12.0 emit quiet <, <= comparisons on x86, arm64
 * - Clang 12.0+ requires -frounding-math or #pragma STDC FENV_ACCESS ON
 * - Quiet (unordered) compares are still emitted on arm32, powerpc, loongarch64...
 * - ChibiCC, Cproc compilers emit quiet <, <= comparisons (ucomiss instead of comiss)
 * - SlimCC compiler (ChibiCC fork) fixes this
 * Workaround: Manually raise FE_INVALID if neither a < b nor b <= a
 *
 * Quiet comparisons (__builtin_isless) may be miscompiled or inefficient:
 * - Signaling comparisons are emitted on RISC-V instead
 * Workaround: Manually check !fpu_is_nan() before <, <= comparison
 *
 * Converting floats into 64-bit integers may break FE_INEXACT semantics:
 * - For inexact fp->i64 conversions, FE_INEXACT is missed on riscv32, arm32, powerpc32, loongarch64
 * - Same issue with TCC compiler
 * - Cproc compiler erroneously raises FE_INEXACT for exact fp->i64 conversions
 * Workaround: Manually raise FE_INEXACT on inexact conversion
 *
 * Converting floats into unsigned integers may be broken:
 * - Cproc compiler uses cvtss2si for fp->u64 conversions, which is incorrect for >INT64_MAX inputs
 * - Same for XCC compiler, and fp->u32 conversions are also broken in same manner
 * Workaround: Implement fp->u32 conversion as fp->i64, fp->u64 conversion in software for >INT_MAX inputs
 *
 * Converting unsigned integers into floats may be broken:
 * - ChibiCC compiler uses cvtsi2ss for u64->fp conversions, which is incorrect for >INT64_MAX inputs
 * - Same for XCC compiler, and u32->fp conversions are also broken in same manner
 * - SlimCC compiler (ChibiCC fork) fixes this
 * Workaround: Implement u32->fp conversion as i64->fp, u64->fp conversion in software for >INT_MAX inputs
 *
 * Intel OneAPI compiler enables FTZ/DAZ by default (Among other things):
 * - Enabling "flush to zero" or "denormals are zero" breaks denormals and causes test failure
 * Workaround: Pass -fno-fast-math at build time, clear FTZ/DAZ in MXCSR handling code
 *
 * Calling into libm may break FE_INEXACT/FE_INVALID semantics:
 * - Windows fails to set FE_INVALID on sqrt(-1), even when using MinGW
 * - Musl has FENV_SUPPORT as optional, and may fail to set FE_INEXACT, etc
 * Workaround: Handle FE_INVALID manually, prefer native instructions
 *
 * Calling into libm may be completely broken:
 * - WinNT 3.x/4.0, Win95 lack SSE state handling, any libm function that uses SSE crashes
 * - Determinism is desired, yet various libm implementation may have non-bit-exact results
 * Workaround: Handle fenv in asembly, prefer native instructions
 *
 * WASM, mips/mips64, m68k, hexagon, Windows CE (ARM32) platforms lack FENV completely:
 * - Basically what the title says. None of those have a working <fenv.h> implementation.
 * - There is no way to implement fesetround() anyways, at least as far as I'm aware
 * Workaround: Thread-local exception flags, which are raised manually based on various checks
 *
 * NOTE: RISC-V THead CPUs also lack FPU exceptions, but I'm hesitant to enable such workaround
 * on RISC-V in general as this case is a single non-conforming hardware implementation.
 *
 * This list is not even considering various historical compilers, and MSVC. To be continued?...
 *
 * Because of this, and because most workarounds have little performance impact, consider limited
 * list of compilers/platforms a "Known Nice Platform" (C) and apply workarounds otherwise.
 */

#if defined(__wasm__) || defined(__mips__) || defined(__mips) || defined(__mips64__) || defined(__mips64) /**/         \
    || defined(__m68k__) || defined(__hexagon__) || (defined(_WIN32) && defined(_M_ARM)) || !CHECK_INCLUDE(fenv.h, 1)
// Enable FENV emulation
#undef USE_SOFT_FPU_FENV
#define USE_SOFT_FPU_FENV 1
#endif

#if CLANG_CHECK_VER(12, 0)
// Fix various floating-point misoptimizations. Why the fuck is Clang not IEEE 754 compliant by default???
#if !defined(__INTEL_LLVM_COMPILER) /**/                                                                               \
    && !defined(__i386__) && !defined(__x86_64__) && !defined(__aarch64__) && !defined(__riscv_d)
// This pragma fixes build on Intel OneAPI compiler, and Clang sqrt() behavior at least on powerpc
// Emabling this flag on x86/arm64/riscv needlessly pessimizes the codegen
#pragma float_control(precise)
#endif
#pragma STDC FENV_ACCESS ON
#elif defined(COMPILER_IS_MSVC)
#pragma fenv_access(on)
#endif

// GCC 8.1+, Clang 12.0+, SlimCC respect IEEE 754
#undef FPU_LIB_KNOWN_SANE_COMPILER
#if !defined(USE_FPU_WORKAROUNDS) /**/                                                                                 \
    && (GCC_CHECK_VER(8, 1) || CLANG_CHECK_VER(12, 0) || defined(__slimcc__))
#define FPU_LIB_KNOWN_SANE_COMPILER 1
#endif

// Conversion uint->fp is correct on modern GCC/Clang, SlimCC
#undef FPU_LIB_CORRECT_CONVERSION_UINT_FP
#if defined(FPU_LIB_KNOWN_SANE_COMPILER)
#define FPU_LIB_CORRECT_CONVERSION_UINT_FP 1
#endif

// Conversion fp->uint is correct on i386, x86_64, arm64, riscv64, powerpc64, mips64
#undef FPU_LIB_CORRECT_CONVERSION_FP_UINT
#if !defined(USE_SOFT_FPU_FENV) && defined(FPU_LIB_KNOWN_SANE_COMPILER)        /**/                                    \
    && (defined(__i386__) || defined(__x86_64__)                               /**/                                    \
        || defined(__aarch64__) || (defined(__riscv_d) && defined(HOST_64BIT)) /**/                                    \
        || defined(__powerpc64__) || defined(__mips64__))
#define FPU_LIB_CORRECT_CONVERSION_FP_UINT 1
#endif

// Comparison via <, <= is signaling on i386, x86_64, arm64, riscv
#undef FPU_LIB_CORRECT_SIGNALING_COMPARE
#if !defined(USE_SOFT_FPU_FENV) && defined(FPU_LIB_KNOWN_SANE_COMPILER) /**/                                           \
    && (defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) || defined(__riscv_d))
#define FPU_LIB_CORRECT_SIGNALING_COMPARE 1
#endif

// Comparison via __builtin_isless() is quiet on i386, x86_64, arm, arm64, powerpc
#undef FPU_LIB_CORRECT_BUILTIN_QUIET_COMPARE
#if !defined(USE_SOFT_FPU_FENV) && defined(FPU_LIB_KNOWN_SANE_COMPILER)    /**/                                        \
    && (defined(__i386__) || defined(__x86_64__)                           /**/                                        \
        || defined(__arm__) || defined(__aarch64__)                        /**/                                        \
        || defined(__powerpc__) || defined(__powerpc64__))                 /**/                                        \
    && GNU_BUILTIN(__builtin_isless) && GNU_BUILTIN(__builtin_islessequal) /**/                                        \
    && GNU_BUILTIN(__builtin_isgreater) && GNU_BUILTIN(__builtin_isgreaterequal)
#define FPU_LIB_CORRECT_BUILTIN_QUIET_COMPARE 1
#endif

// Min/Max via __builtin_fmin() conforms to IEEE 754-2008 on riscv
#undef FPU_LIB_CORRECT_BUILTIN_IEEE754_2008_FMINMAX
#if !defined(USE_SOFT_FPU_FENV) && defined(FPU_LIB_KNOWN_SANE_COMPILER) /**/                                           \
    && defined(__riscv_f) && defined(__riscv_d)                         /**/                                           \
    && GNU_BUILTIN(__builtin_fminf) && GNU_BUILTIN(__builtin_fmin)      /**/                                           \
    && GNU_BUILTIN(__builtin_fmaxf) && GNU_BUILTIN(__builtin_fmax)
#define FPU_LIB_CORRECT_BUILTIN_IEEE754_2008_FMINMAX 1
#endif

// Copying sign via __builtin_copysign() is observably more optimal on x86_64, arm64, riscv
#undef FPU_LIB_OPTIMAL_BUILTIN_COPYSIGN
#if !defined(USE_SOFT_FPU_WRAP) && defined(FPU_LIB_KNOWN_SANE_COMPILER)    /**/                                        \
    && (defined(__x86_64__) || defined(__aarch64__) || defined(__riscv_d)) /**/                                        \
    && GNU_BUILTIN(__builtin_copysign) && GNU_BUILTIN(__builtin_copysignf)
#define FPU_LIB_OPTIMAL_BUILTIN_COPYSIGN 1
#endif

// Fused-multiply-add via __builtin_fma() is optimal on arm64, riscv, powerpc
#undef FPU_LIB_OPTIMAL_BUILTIN_FMA
#if !defined(USE_SOFT_FPU_FENV)                                                                       /**/             \
    && (defined(__aarch64__) || defined(__riscv_d) || defined(__powerpc__) || defined(__powerpc64__)) /**/             \
    && GNU_BUILTIN(__builtin_fmaf) && GNU_BUILTIN(__builtin_fma)
#define FPU_LIB_OPTIMAL_BUILTIN_FMA 1
#endif

/*
 * Hide <math.h> definitions from generic code if possible - it should not be used
 */
#if GNU_BUILTIN(__builtin_sqrt) && GNU_BUILTIN(__builtin_sqrtf)
#define fpu_sqrt_internal(d)  __builtin_sqrt(d)
#define fpu_sqrtf_internal(f) __builtin_sqrtf(f)
#else
#include <math.h>
#define fpu_sqrt_internal(d)  sqrt(d)
#define fpu_sqrtf_internal(f) sqrtf(f)
#endif

/*
 * FPU Environment handling
 *
 * NOTE: Those definitions match RISC-V bitfields :D
 */

// FPU Exception flags
#define FPU_LIB_FLAG_NX         0x01 // Inexact result
#define FPU_LIB_FLAG_UF         0x02 // Underflow
#define FPU_LIB_FLAG_OF         0x04 // Overflow
#define FPU_LIB_FLAG_DZ         0x08 // Division by zero
#define FPU_LIB_FLAG_NV         0x10 // Invalid operation

#define FPU_LIB_FLAGS_ALL       0x1F

// FPU Rounding modes
#define FPU_LIB_ROUND_NE        0x00 // Round to nearest
#define FPU_LIB_ROUND_TZ        0x01 // Round to zero
#define FPU_LIB_ROUND_DN        0x02 // Round down (Towards negative infinity)
#define FPU_LIB_ROUND_UP        0x03 // Round up (Towards positive infinity)
#define FPU_LIB_ROUND_MM        0x04 // Round to nearest, ties to Max Magnitude

// FPU Classification
#define FPU_CLASS_NEG_INF       0x00 // Negative infinity
#define FPU_CLASS_NEG_NORMAL    0x01 // Negative normal
#define FPU_CLASS_NEG_SUBNORMAL 0x02 // Negative subnormal
#define FPU_CLASS_NEG_ZERO      0x03 // Negative zero
#define FPU_CLASS_POS_ZERO      0x04 // Positive zero
#define FPU_CLASS_POS_SUBNORMAL 0x05 // Positive subnormal
#define FPU_CLASS_POS_NORMAL    0x06 // Positive normal
#define FPU_CLASS_POS_INF       0x07 // Positive infinity
#define FPU_CLASS_NAN_SIG       0x08 // Signaling NaN
#define FPU_CLASS_NAN_QUIET     0x09 // Quitet NaN

// Exception handling
uint32_t fpu_get_exceptions(void);
void     fpu_set_exceptions(uint32_t new_exceptions);
void     fpu_raise_exceptions(uint32_t set_exceptions);
void     fpu_clear_exceptions(uint32_t clr_exceptions);

// Rounding mode handling
uint32_t fpu_get_rounding_mode(void);
void     fpu_set_rounding_mode(uint32_t mode);

// Floating-point Classification
slow_path uint32_t fpu_fclass32(fpu_f32_t f);
slow_path uint32_t fpu_fclass64(fpu_f64_t d);

// Internal rounding helpers (Only to be coverted to an integer afterwards)
slow_path fpu_f32_t fpu_round_f32_internal(fpu_f32_t f, uint32_t mode);
slow_path fpu_f64_t fpu_round_f64_internal(fpu_f64_t d, uint32_t mode);

// Unintrusive soft exception handling for inlined paths
slow_path void fpu_raise_inexact(void);
slow_path void fpu_raise_invalid(void);
slow_path void fpu_raise_ovrflow(void);
slow_path void fpu_raise_divzero(void);

#if defined(USE_SOFT_FPU_FENV)
slow_path void fpu_soft_fenv_check_add32(fpu_f32_t sum, fpu_f32_t a, fpu_f32_t b);
slow_path void fpu_soft_fenv_check_add64(fpu_f64_t sum, fpu_f64_t a, fpu_f64_t b);
slow_path void fpu_soft_fenv_check_mul32(fpu_f32_t mul, fpu_f32_t a, fpu_f32_t b, bool is_div);
slow_path void fpu_soft_fenv_check_mul64(fpu_f64_t sum, fpu_f64_t a, fpu_f64_t b, bool is_div);
#endif

/*
 * Floating-point bitcasts
 */

static forceinline uint32_t fpu_bit_f32_to_u32(fpu_f32_t f)
{
#if defined(USE_SOFT_FPU_WRAP) && defined(USE_SOFT_FPU_ENCAP)
    return f.scalar;
#elif defined(USE_SOFT_FPU_WRAP)
    return f;
#else
    uint32_t u;
    memcpy(&u, &f, sizeof(f));
    return u;
#endif
}

static forceinline fpu_f32_t fpu_bit_u32_to_f32(uint32_t u)
{
#if defined(USE_SOFT_FPU_WRAP) && defined(USE_SOFT_FPU_ENCAP)
    fpu_f32_t ret = {.scalar = u};
    return ret;
#elif defined(USE_SOFT_FPU_WRAP)
    return u;
#else
    fpu_f32_t f;
    memcpy(&f, &u, sizeof(f));
    return f;
#endif
}

static forceinline uint64_t fpu_bit_f64_to_u64(fpu_f64_t d)
{
#if defined(USE_SOFT_FPU_WRAP) && defined(USE_SOFT_FPU_ENCAP)
    return d.scalar;
#elif defined(USE_SOFT_FPU_WRAP)
    return d;
#else
    uint64_t u;
    memcpy(&u, &d, sizeof(d));
    return u;
#endif
}

static forceinline fpu_f64_t fpu_bit_u64_to_f64(uint64_t u)
{
#if defined(USE_SOFT_FPU_WRAP) && defined(USE_SOFT_FPU_ENCAP)
    fpu_f64_t ret = {.scalar = u};
    return ret;
#elif defined(USE_SOFT_FPU_WRAP)
    return u;
#else
    fpu_f64_t d;
    memcpy(&d, &u, sizeof(d));
    return d;
#endif
}

static forceinline bool fpu_is_bit_equal32(fpu_f32_t a, fpu_f32_t b)
{
    return fpu_bit_f32_to_u32(a) == fpu_bit_f32_to_u32(b);
}

static forceinline bool fpu_is_bit_equal64(fpu_f64_t a, fpu_f64_t b)
{
    return fpu_bit_f64_to_u64(a) == fpu_bit_f64_to_u64(b);
}

/*
 * Software wrapping of scalar floating-point
 */

static forceinline actual_float_t fpu_raw_f32(fpu_f32_t f)
{
#if defined(USE_SOFT_FPU_WRAP)
    actual_float_t ret;
    memcpy(&ret, &f, sizeof(f));
    return ret;
#elif defined(USE_SOFT_FPU_ENCAP)
    return f.scalar;
#else
    return f;
#endif
}

static forceinline actual_double_t fpu_raw_f64(fpu_f64_t d)
{
#if defined(USE_SOFT_FPU_WRAP)
    actual_double_t ret;
    memcpy(&ret, &d, sizeof(d));
    return ret;
#elif defined(USE_SOFT_FPU_ENCAP)
    return d.scalar;
#else
    return d;
#endif
}

static forceinline fpu_f32_t fpu_wrap_f32(actual_float_t f)
{
#if defined(USE_SOFT_FPU_WRAP)
    fpu_f32_t ret;
    memcpy(&ret, &f, sizeof(f));
    return ret;
#elif defined(USE_SOFT_FPU_ENCAP)
    fpu_f32_t ret = {.scalar = f};
    return ret;
#else
    return f;
#endif
}

static forceinline fpu_f64_t fpu_wrap_f64(actual_double_t d)
{
#if defined(USE_SOFT_FPU_WRAP)
    fpu_f64_t ret;
    memcpy(&ret, &d, sizeof(d));
    return ret;
#elif defined(USE_SOFT_FPU_ENCAP)
    fpu_f64_t ret = {.scalar = d};
    return ret;
#else
    return d;
#endif
}

/*
 * Fast software floating-point helpers
 */

#define FPU_LIB_FP32_SIGNEDFP_MASK 0x80000000U
#define FPU_LIB_FP32_NOSIGNED_MASK 0x7FFFFFFFU
#define FPU_LIB_FP32_EXPONENT_MASK 0x7F800000U
#define FPU_LIB_FP32_MANTISSA_MASK 0x007FFFFFU
#define FPU_LIB_FP32_QUIETNAN_MASK 0x00400000U

#define FPU_LIB_FP32_CANONICAL_NAN 0x7FC00000U
#define FPU_LIB_FP32_POSITIVE_INF  0x7F800000U
#define FPU_LIB_FP32_NEGATIVE_INF  0xFF800000U

#define FPU_LIB_FP64_SIGNEDFP_MASK 0x8000000000000000ULL
#define FPU_LIB_FP64_NOSIGNED_MASK 0x7FFFFFFFFFFFFFFFULL
#define FPU_LIB_FP64_EXPONENT_MASK 0x7FF0000000000000ULL
#define FPU_LIB_FP64_MANTISSA_MASK 0x000FFFFFFFFFFFFFULL
#define FPU_LIB_FP64_QUIETNAN_MASK 0x0008000000000000ULL

#define FPU_LIB_FP64_CANONICAL_NAN 0x7FF8000000000000ULL
#define FPU_LIB_FP64_POSITIVE_INF  0x7FF0000000000000ULL
#define FPU_LIB_FP64_NEGATIVE_INF  0xFFF0000000000000ULL

// Checks for finite non-NaN, non-Inf input, never sets exceptions
static forceinline bool fpu_is_finite32(fpu_f32_t f)
{
    return (fpu_bit_f32_to_u32(f) << 1) < (FPU_LIB_FP32_EXPONENT_MASK << 1);
}

static forceinline bool fpu_is_finite64(fpu_f64_t d)
{
    return (fpu_bit_f64_to_u64(d) << 1) < (FPU_LIB_FP64_EXPONENT_MASK << 1);
}

// Checks for positive, possibly infinite non-NaN input, never sets exceptions
static forceinline bool fpu_is_positive32(fpu_f32_t f)
{
    return fpu_bit_f32_to_u32(f) <= FPU_LIB_FP32_POSITIVE_INF;
}

static forceinline bool fpu_is_positive64(fpu_f64_t d)
{
    return fpu_bit_f64_to_u64(d) <= FPU_LIB_FP64_POSITIVE_INF;
}

// Checks for negative, possibly infinite non-NaN input, never sets exceptions
static forceinline bool fpu_is_negative32(fpu_f32_t f)
{
    int32_t i = (int32_t)fpu_bit_f32_to_u32(f);
    return i <= (int32_t)FPU_LIB_FP32_NEGATIVE_INF;
}

static forceinline bool fpu_is_negative64(fpu_f64_t d)
{
    int64_t i = (int64_t)fpu_bit_f64_to_u64(d);
    return i <= (int64_t)FPU_LIB_FP64_NEGATIVE_INF;
}

// Checks for NaN input, never sets exceptions
static forceinline bool fpu_is_nan32_soft(fpu_f32_t f)
{
    return (fpu_bit_f32_to_u32(f) << 1) > (FPU_LIB_FP32_POSITIVE_INF << 1);
}

static forceinline bool fpu_is_nan64_soft(fpu_f64_t d)
{
    return (fpu_bit_f64_to_u64(d) << 1) > (FPU_LIB_FP64_POSITIVE_INF << 1);
}

// Checks for sNaN input, never sets exceptions
static forceinline bool fpu_is_snan32_soft(fpu_f32_t f)
{
    uint32_t u = fpu_bit_f32_to_u32(f) << 1;
    return (u - ((FPU_LIB_FP32_POSITIVE_INF << 1) + 1)) < ((FPU_LIB_FP32_QUIETNAN_MASK << 1) - 1);
}

static forceinline bool fpu_is_snan64_soft(fpu_f64_t d)
{
    uint64_t u = fpu_bit_f64_to_u64(d) << 1;
    return (u - ((FPU_LIB_FP64_POSITIVE_INF << 1) + 1)) < ((FPU_LIB_FP64_QUIETNAN_MASK << 1) - 1);
}

// Returns normalized exponent value
static forceinline int32_t fpu_exponent32(fpu_f32_t f)
{
    return ((int32_t)((fpu_bit_f32_to_u32(f) >> 23) & 0xFF)) - 0x7F;
}

static forceinline int32_t fpu_exponent64(fpu_f64_t f)
{
    return ((int32_t)((fpu_bit_f64_to_u64(f) >> 52) & 0x7FF)) - 0x3FF;
}

// Returns true on non-zero fractional part
static forceinline bool fpu_is_fractional32(fpu_f32_t f)
{
    uint32_t u = fpu_bit_f32_to_u32(f);
    if (u << 1) {
        int32_t e = fpu_exponent32(f);
        if (e < 0) {
            return true;
        } else if (e < 23) {
            return !!(u & (FPU_LIB_FP32_MANTISSA_MASK >> e));
        }
    }
    return false;
}

static forceinline bool fpu_is_fractional64(fpu_f64_t d)
{
    uint64_t u = fpu_bit_f64_to_u64(d);
    if (u << 1) {
        int32_t e = fpu_exponent64(d);
        if (e < 0) {
            return true;
        } else if (e < 52) {
            return !!(u & (FPU_LIB_FP64_MANTISSA_MASK >> e));
        }
    }
    return false;
}

/*
 * Floating-point precision error calculation
 *
 * See https://ir.cwi.nl/pub/9159/9159D.pdf
 */

static inline fpu_f32_t fpu_add_error32(fpu_f32_t sum, fpu_f32_t a, fpu_f32_t b)
{
    actual_float_t fs = fpu_raw_f32(sum);
    actual_float_t fa = fpu_raw_f32(a);
    actual_float_t fb = fpu_raw_f32(b);
    actual_float_t fi = fs - fa;
    return fpu_wrap_f32((fb - fi) + (fa - (fs - fi)));
}

static inline fpu_f64_t fpu_add_error64(fpu_f64_t sum, fpu_f64_t a, fpu_f64_t b)
{
    actual_double_t fs = fpu_raw_f64(sum);
    actual_double_t fa = fpu_raw_f64(a);
    actual_double_t fb = fpu_raw_f64(b);
    actual_double_t fi = fs - fa;
    return fpu_wrap_f64((fb - fi) + (fa - (fs - fi)));
}

static inline fpu_f32_t fpu_mul_error32(fpu_f32_t mul, fpu_f32_t a, fpu_f32_t b)
{
    actual_float_t fm = fpu_raw_f32(mul);
    actual_float_t ha = fpu_raw_f32(fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(a) & ~0x7FFU));
    actual_float_t la = fpu_raw_f32(a) - ha;
    actual_float_t hb = fpu_raw_f32(fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(b) & ~0x7FFU));
    actual_float_t lb = fpu_raw_f32(b) - hb;
    return fpu_wrap_f32((((ha * hb - fm) + ha * lb) + la * hb) + la * lb);
}

static inline fpu_f64_t fpu_mul_error64(fpu_f64_t mul, fpu_f64_t a, fpu_f64_t b)
{
    actual_double_t fm = fpu_raw_f64(mul);
    actual_double_t ha = fpu_raw_f64(fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(a) & ~0x7FFFFFFULL));
    actual_double_t la = fpu_raw_f64(a) - ha;
    actual_double_t hb = fpu_raw_f64(fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(b) & ~0x7FFFFFFULL));
    actual_double_t lb = fpu_raw_f64(b) - hb;
    return fpu_wrap_f64((((ha * hb - fm) + ha * lb) + la * hb) + la * lb);
}

/*
 * Floating-point odd rounding against error value for precision fixup
 */

static inline fpu_f32_t fpu_odd_round32(fpu_f32_t val, fpu_f32_t err)
{
    uint32_t uv = fpu_bit_f32_to_u32(val);
    uint32_t ue = fpu_bit_f32_to_u32(err);
    if (ue && !(uv & 1)) {
        if (ue >> 31) {
            return fpu_bit_u32_to_f32(uv - 1);
        } else {
            return fpu_bit_u32_to_f32(uv + 1);
        }
    }
    return val;
}

static inline fpu_f64_t fpu_odd_round64(fpu_f64_t val, fpu_f64_t err)
{
    uint64_t uv = fpu_bit_f64_to_u64(val);
    uint64_t ue = fpu_bit_f64_to_u64(err);
    if (ue && !(uv & 1)) {
        if (ue >> 63) {
            return fpu_bit_u64_to_f64(uv - 1);
        } else {
            return fpu_bit_u64_to_f64(uv + 1);
        }
    }
    return val;
}

/*
 * NaN-boxing, explicit endian load/store intrinsics
 */

static forceinline fpu_f64_t fpu_nanbox_f32(fpu_f32_t f)
{
    return fpu_bit_u64_to_f64(fpu_bit_f32_to_u32(f) | 0xFFFFFFFF00000000ULL);
}

static forceinline fpu_f32_t fpu_unpack_f32_from_f64(fpu_f64_t d)
{
    return fpu_bit_u32_to_f32(fpu_bit_f64_to_u64(d));
}

static forceinline fpu_f32_t fpu_nan_unbox_f32(fpu_f64_t d)
{
    uint64_t u = fpu_bit_f64_to_u64(d);
    if (likely(u >= 0xFFFFFFFF00000000ULL)) {
        return fpu_bit_u32_to_f32(u);
    }
    return fpu_bit_u32_to_f32(FPU_LIB_FP32_CANONICAL_NAN);
}

TSAN_SUPPRESS static forceinline fpu_f32_t fpu_load_f32_le(const void* addr)
{
#if defined(HOST_LITTLE_ENDIAN)
    return *(const safe_aliasing fpu_f32_t*)addr;
#else
    return fpu_bit_u32_to_f32(read_uint32_le(addr));
#endif
}

TSAN_SUPPRESS static forceinline void fpu_store_f32_le(void* addr, fpu_f32_t f)
{
#if defined(HOST_LITTLE_ENDIAN)
    *(safe_aliasing fpu_f32_t*)addr = f;
#else
    write_uint32_le(addr, fpu_bit_f32_to_u32(f));
#endif
}

TSAN_SUPPRESS static forceinline fpu_f64_t fpu_load_f64_le(const void* addr)
{
#if defined(HOST_LITTLE_ENDIAN)
    return *(const safe_aliasing fpu_f64_t*)addr;
#else
    return fpu_bit_u64_to_f64(read_uint64_le(addr));
#endif
}

TSAN_SUPPRESS static forceinline void fpu_store_f64_le(void* addr, fpu_f64_t d)
{
#if defined(HOST_LITTLE_ENDIAN)
    *(safe_aliasing fpu_f64_t*)addr = d;
#else
    write_uint64_le(addr, fpu_bit_f64_to_u64(d));
#endif
}

/*
 * NaN checking
 */

// Those functions raise FE_INVALID on sNaN, and that is commonly needed
static forceinline bool fpu_is_nan32(fpu_f32_t f)
{
#if defined(USE_SOFT_FPU_FENV)
    if (likely(!fpu_is_nan32_soft(f))) {
        return false;
    }
    if (fpu_is_snan32_soft(f)) {
        fpu_raise_invalid();
    }
    return true;
#else
    return fpu_raw_f32(f) != fpu_raw_f32(f);
#endif
}

static forceinline bool fpu_is_nan64(fpu_f64_t d)
{
#if defined(USE_SOFT_FPU_FENV)
    if (likely(!fpu_is_nan64_soft(d))) {
        return false;
    }
    if (fpu_is_snan64_soft(d)) {
        fpu_raise_invalid();
    }
    return true;
#else
    return fpu_raw_f64(d) != fpu_raw_f64(d);
#endif
}

// NOTE: Those functions assume FE_INVALID is already set
static forceinline bool fpu_is_nan32_fast_soft(fpu_f32_t f)
{
#if defined(USE_SOFT_FPU_FENV)
    return fpu_is_nan32_soft(f);
#else
    return fpu_raw_f32(f) != fpu_raw_f32(f);
#endif
}

static forceinline bool fpu_is_nan64_fast_soft(fpu_f64_t d)
{
#if defined(USE_SOFT_FPU_FENV)
    return fpu_is_nan64_soft(d);
#else
    return fpu_raw_f64(d) != fpu_raw_f64(d);
#endif
}

/*
 * Floating-point sign operations
 */

static forceinline bool fpu_signbit32(fpu_f32_t f)
{
    return !!(fpu_bit_f32_to_u32(f) >> 31);
}

static forceinline bool fpu_signbit64(fpu_f64_t d)
{
    return !!(fpu_bit_f64_to_u64(d) >> 63);
}

static forceinline fpu_f32_t fpu_neg32(fpu_f32_t f)
{
#if defined(USE_SOFT_FPU_WRAP)
    return fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(f) ^ FPU_LIB_FP32_SIGNEDFP_MASK);
#else
    return fpu_wrap_f32(-fpu_raw_f32(f));
#endif
}

static forceinline fpu_f64_t fpu_neg64(fpu_f64_t d)
{
#if defined(USE_SOFT_FPU_WRAP)
    return fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(d) ^ FPU_LIB_FP64_SIGNEDFP_MASK);
#else
    return fpu_wrap_f64(-fpu_raw_f64(d));
#endif
}

static forceinline fpu_f32_t fpu_fsgnj32(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_OPTIMAL_BUILTIN_COPYSIGN)
    return fpu_wrap_f32(__builtin_copysignf(fpu_raw_f32(a), fpu_raw_f32(b)));
#else
    uint32_t ua = fpu_bit_f32_to_u32(a);
    uint32_t ub = fpu_bit_f32_to_u32(b);
    return fpu_bit_u32_to_f32(ua ^ ((ua ^ ub) & FPU_LIB_FP32_SIGNEDFP_MASK));
#endif
}

static forceinline fpu_f64_t fpu_fsgnj64(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_OPTIMAL_BUILTIN_COPYSIGN)
    return fpu_wrap_f64(__builtin_copysign(fpu_raw_f64(a), fpu_raw_f64(b)));
#else
    uint64_t ua = fpu_bit_f64_to_u64(a);
    uint64_t ub = fpu_bit_f64_to_u64(b);
    return fpu_bit_u64_to_f64(ua ^ ((ua ^ ub) & FPU_LIB_FP64_SIGNEDFP_MASK));
#endif
}

static forceinline fpu_f32_t fpu_fsgnjn32(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_OPTIMAL_BUILTIN_COPYSIGN)
    return fpu_wrap_f32(__builtin_copysignf(fpu_raw_f32(a), -fpu_raw_f32(b)));
#else
    uint32_t ua = fpu_bit_f32_to_u32(a);
    uint32_t ub = fpu_bit_f32_to_u32(b);
    return fpu_bit_u32_to_f32(ua ^ (~(ua ^ ub) & FPU_LIB_FP32_SIGNEDFP_MASK));
#endif
}

static forceinline fpu_f64_t fpu_fsgnjn64(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_OPTIMAL_BUILTIN_COPYSIGN)
    return fpu_wrap_f64(__builtin_copysign(fpu_raw_f64(a), -fpu_raw_f64(b)));
#else
    uint64_t ua = fpu_bit_f64_to_u64(a);
    uint64_t ub = fpu_bit_f64_to_u64(b);
    return fpu_bit_u64_to_f64(ua ^ (~(ua ^ ub) & FPU_LIB_FP64_SIGNEDFP_MASK));
#endif
}

static forceinline fpu_f32_t fpu_fsgnjx32(fpu_f32_t a, fpu_f32_t b)
{
    uint32_t ua = fpu_bit_f32_to_u32(a);
    uint32_t ub = fpu_bit_f32_to_u32(b);
    return fpu_bit_u32_to_f32(ua ^ (ub & FPU_LIB_FP32_SIGNEDFP_MASK));
}

static forceinline fpu_f64_t fpu_fsgnjx64(fpu_f64_t a, fpu_f64_t b)
{
    uint64_t ua = fpu_bit_f64_to_u64(a);
    uint64_t ub = fpu_bit_f64_to_u64(b);
    return fpu_bit_u64_to_f64(ua ^ (ub & FPU_LIB_FP64_SIGNEDFP_MASK));
}

/*
 * Floating-point width conversions
 */

static forceinline fpu_f64_t fpu_fcvt_f32_to_f64(fpu_f32_t f)
{
    return fpu_wrap_f64((actual_double_t)fpu_raw_f32(f));
}

static forceinline fpu_f32_t fpu_fcvt_f64_to_f32(fpu_f64_t d)
{
    return fpu_wrap_f32((actual_float_t)fpu_raw_f64(d));
}

/*
 * Floating-point Arithmetic
 */

static forceinline fpu_f32_t fpu_add32(fpu_f32_t a, fpu_f32_t b)
{
    fpu_f32_t sum = fpu_wrap_f32(fpu_raw_f32(a) + fpu_raw_f32(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_add32(sum, a, b);
#endif
    return sum;
}

static forceinline fpu_f64_t fpu_add64(fpu_f64_t a, fpu_f64_t b)
{
    fpu_f64_t sum = fpu_wrap_f64(fpu_raw_f64(a) + fpu_raw_f64(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_add64(sum, a, b);
#endif
    return sum;
}

static forceinline fpu_f32_t fpu_sub32(fpu_f32_t a, fpu_f32_t b)
{
    fpu_f32_t sub = fpu_wrap_f32(fpu_raw_f32(a) - fpu_raw_f32(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_add32(sub, a, fpu_neg32(b));
#endif
    return sub;
}

static forceinline fpu_f64_t fpu_sub64(fpu_f64_t a, fpu_f64_t b)
{
    fpu_f64_t sub = fpu_wrap_f64(fpu_raw_f64(a) - fpu_raw_f64(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_add64(sub, a, fpu_neg64(b));
#endif
    return sub;
}

static forceinline fpu_f32_t fpu_mul32(fpu_f32_t a, fpu_f32_t b)
{
    fpu_f32_t mul = fpu_wrap_f32(fpu_raw_f32(a) * fpu_raw_f32(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_mul32(mul, a, b, false);
#endif
    return mul;
}

static forceinline fpu_f64_t fpu_mul64(fpu_f64_t a, fpu_f64_t b)
{
    fpu_f64_t mul = fpu_wrap_f64(fpu_raw_f64(a) * fpu_raw_f64(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_mul64(mul, a, b, false);
#endif
    return mul;
}

static forceinline fpu_f32_t fpu_div32(fpu_f32_t a, fpu_f32_t b)
{
    fpu_f32_t div = fpu_wrap_f32(fpu_raw_f32(a) / fpu_raw_f32(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_mul32(a, b, div, true);
#endif
    return div;
}

static forceinline fpu_f64_t fpu_div64(fpu_f64_t a, fpu_f64_t b)
{
    fpu_f64_t div = fpu_wrap_f64(fpu_raw_f64(a) / fpu_raw_f64(b));
#if defined(USE_SOFT_FPU_FENV)
    fpu_soft_fenv_check_mul64(a, b, div, true);
#endif
    return div;
}

static forceinline func_opt_size fpu_f32_t fpu_fma32(fpu_f32_t a, fpu_f32_t b, fpu_f32_t c)
{
#if defined(FPU_LIB_OPTIMAL_BUILTIN_FMA)
    return fpu_wrap_f32(__builtin_fmaf(fpu_raw_f32(a), fpu_raw_f32(b), fpu_raw_f32(c)));
#else
    // Multiply a * b as fp64, this is guaranteed be exact since mantissa is more than twice as large
    fpu_f64_t mul = fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b));
    fpu_f64_t add = fpu_fcvt_f32_to_f64(c);
    // Add mul + c as fp64, note this actually may incur a rounding error
    fpu_f64_t sum = fpu_add64(mul, add);
    fpu_f64_t err = fpu_add_error64(sum, mul, add);
    // Fixup the rounding error in final result, downcast to fp32 as a final rounding step
    fpu_f64_t res = fpu_odd_round64(sum, err);
    fpu_f32_t ret = fpu_fcvt_f64_to_f32(res);
#if defined(USE_SOFT_FPU_FENV)
    // Check the downcasted result against full precision sum + err
    fpu_soft_fenv_check_add64(fpu_fcvt_f32_to_f64(ret), sum, err);
#endif
    return ret;
#endif
}

static forceinline fpu_f64_t fpu_fma64(fpu_f64_t a, fpu_f64_t b, fpu_f64_t c)
{
#if defined(FPU_LIB_OPTIMAL_BUILTIN_FMA)
    return fpu_wrap_f64(__builtin_fma(fpu_raw_f64(a), fpu_raw_f64(b), fpu_raw_f64(c)));
#else
    // Multiply a * b and calculate rounding error
    fpu_f64_t mul = fpu_mul64(a, b);
    fpu_f64_t e_m = fpu_mul_error64(mul, a, b);
    // Add mul + c and calculate rounding error
    fpu_f64_t sum = fpu_add64(mul, c);
    fpu_f64_t e_s = fpu_add_error64(sum, mul, c);
    // Calculate the sum of previous rounding errors, fixup via odd rounding
    fpu_f64_t e_f = fpu_add64(e_m, e_s);
    fpu_f64_t err = fpu_odd_round64(e_f, fpu_add_error64(e_f, e_s, e_m));
    // Calculate final result via adding the oddly-rounded sum of rounding errors
    fpu_f64_t ret = fpu_add64(sum, err);
    return ret;
#endif
}

static forceinline fpu_f32_t fpu_sqrt32(fpu_f32_t f)
{
    if (likely(fpu_is_positive32(f))) {
        fpu_f32_t ret = fpu_wrap_f32(fpu_sqrtf_internal(fpu_raw_f32(f)));
#if defined(USE_SOFT_FPU_FENV)
        fpu_soft_fenv_check_mul32(f, ret, ret, false);
#endif
        return ret;
    }
    // Raise invalid flag, return a canonical NaN representation
    fpu_raise_invalid();
    return fpu_bit_u32_to_f32(FPU_LIB_FP32_CANONICAL_NAN);
}

static forceinline fpu_f64_t fpu_sqrt64(fpu_f64_t d)
{
    if (likely(fpu_is_positive64(d))) {
        fpu_f64_t ret = fpu_wrap_f64(fpu_sqrt_internal(fpu_raw_f64(d)));
#if defined(USE_SOFT_FPU_FENV)
        fpu_soft_fenv_check_mul64(d, ret, ret, false);
#endif
        return ret;
    }
    // Raise invalid flag, return a canonical NaN representation
    fpu_raise_invalid();
    return fpu_bit_u64_to_f64(FPU_LIB_FP64_CANONICAL_NAN);
}

/*
 * Integer to floating-point conversions
 */

static forceinline fpu_f32_t fpu_fcvt_u32_to_f32(uint32_t u)
{
#if !defined(FPU_LIB_CORRECT_CONVERSION_UINT_FP)
    // Use i64->f32 conversion for correct >0x80000000 values handling
    return fpu_wrap_f32((actual_float_t)(int64_t)u);
#else
    return fpu_wrap_f32((actual_float_t)u);
#endif
}

static forceinline fpu_f64_t fpu_fcvt_u32_to_f64(uint32_t u)
{
#if !defined(FPU_LIB_CORRECT_CONVERSION_UINT_FP)
    // Use i64->f64 conversion for correct >0x80000000 values handling
    return fpu_wrap_f64((actual_double_t)(int64_t)u);
#else
    return fpu_wrap_f64((actual_double_t)u);
#endif
}

static forceinline fpu_f32_t fpu_fcvt_i32_to_f32(int32_t i)
{
    return fpu_wrap_f32((actual_float_t)i);
}

static forceinline fpu_f64_t fpu_fcvt_i32_to_f64(int32_t i)
{
    return fpu_wrap_f64((actual_double_t)i);
}

static forceinline fpu_f32_t fpu_fcvt_u64_to_f32(uint64_t u)
{
#if !defined(FPU_LIB_CORRECT_CONVERSION_UINT_FP)
    if (likely(!(u >> 63))) {
        return fpu_wrap_f32((actual_float_t)(int64_t)u);
    }
    // Lower conversion from u64 as halved, evenly rounded i64, multiply back
    return fpu_wrap_f32(((actual_float_t)(int64_t)((u >> 1) | (u & 1))) * 2.f);
#else
    return fpu_wrap_f32((actual_float_t)u);
#endif
}

static forceinline fpu_f64_t fpu_fcvt_u64_to_f64(uint64_t u)
{
#if !defined(FPU_LIB_CORRECT_CONVERSION_UINT_FP)
    if (likely(!(u >> 63))) {
        return fpu_wrap_f64((actual_double_t)(int64_t)u);
    }
    // Lower conversion from u64 as halved, evenly rounded i64, multiply back
    return fpu_wrap_f64(((actual_double_t)(int64_t)((u >> 1) | (u & 1))) * 2.0);
#else
    return fpu_wrap_f64((actual_double_t)u);
#endif
}

static forceinline fpu_f32_t fpu_fcvt_i64_to_f32(int64_t i)
{
    return fpu_wrap_f32((actual_float_t)i);
}

static forceinline fpu_f64_t fpu_fcvt_i64_to_f64(int64_t i)
{
    return fpu_wrap_f64((actual_double_t)i);
}

/*
 * Check whether floating-point value fits an integer type, never raises exceptions
 */

static forceinline bool fpu_f32_fits_u32(fpu_f32_t f)
{
    uint32_t u = fpu_bit_f32_to_u32(f);
    return u < 0x4F800000U || ((int32_t)u) < ((int32_t)0xBF800000U);
}

static forceinline bool fpu_f64_fits_u32(fpu_f64_t d)
{
    uint64_t u = fpu_bit_f64_to_u64(d);
    return u < 0x41F0000000000000ULL || ((int64_t)u) < ((int64_t)0xBFF0000000000000ULL);
}

static forceinline bool fpu_f32_fits_i32(fpu_f32_t f)
{
    return (fpu_bit_f32_to_u32(f) << 1) < 0x9E000000U;
}

static forceinline bool fpu_f64_fits_i32(fpu_f64_t d)
{
    return (fpu_bit_f64_to_u64(d) << 1) < 0x83C0000000000000ULL;
}

static forceinline bool fpu_f32_fits_u64(fpu_f32_t f)
{
    uint32_t u = fpu_bit_f32_to_u32(f);
    return u < 0x5F800000U || ((int32_t)u) < ((int32_t)0xBF800000U);
}

static forceinline bool fpu_f64_fits_u64(fpu_f64_t d)
{
    uint64_t u = fpu_bit_f64_to_u64(d);
    return u < 0x43F0000000000000ULL || ((int64_t)u) < ((int64_t)0xBFF0000000000000ULL);
}

static forceinline bool fpu_f32_fits_i64(fpu_f32_t f)
{
    return (fpu_bit_f32_to_u32(f) << 1) < 0xBE000000U;
}

static forceinline bool fpu_f64_fits_i64(fpu_f64_t d)
{
    return (fpu_bit_f64_to_u64(d) << 1) < 0x87C0000000000000ULL;
}

/*
 * Floating-point to integer conversions
 *
 * - Converting a value outside output range raises invalid flag & saturates the output
 * - Converting a fractional value raises inexact flag
 * - NaN is treated like positive infinity
 */

// Float to integer
static forceinline uint32_t fpu_fcvt_f32_to_u32(fpu_f32_t f)
{
    if (likely(fpu_f32_fits_u32(f))) {
#if defined(USE_SOFT_FPU_FENV) || !defined(FPU_LIB_CORRECT_CONVERSION_FP_UINT)
        uint32_t ret = (int64_t)fpu_raw_f32(f);
        if (unlikely(!fpu_is_bit_equal32(f, fpu_fcvt_u32_to_f32(ret)))) {
            fpu_raise_inexact();
        }
        return ret;
#else
        return (uint32_t)fpu_raw_f32(f);
#endif
    }
    fpu_raise_invalid();
    if (fpu_is_negative32(f)) {
        return 0;
    }
    return -1;
}

static forceinline uint32_t fpu_fcvt_f64_to_u32(fpu_f64_t d)
{
    if (likely(fpu_f64_fits_u32(d))) {
#if defined(USE_SOFT_FPU_FENV) || !defined(FPU_LIB_CORRECT_CONVERSION_FP_UINT)
        uint32_t ret = (int64_t)fpu_raw_f64(d);
        if (unlikely(!fpu_is_bit_equal64(d, fpu_fcvt_u32_to_f64(ret)))) {
            fpu_raise_inexact();
        }
        return ret;
#else
        return (uint32_t)fpu_raw_f64(d);
#endif
    }
    fpu_raise_invalid();
    if (fpu_is_negative64(d)) {
        return 0;
    }
    return -1;
}

static forceinline int32_t fpu_fcvt_f32_to_i32(fpu_f32_t f)
{
    if (likely(fpu_f32_fits_i32(f))) {
        int32_t ret = (int32_t)fpu_raw_f32(f);
#if defined(USE_SOFT_FPU_FENV)
        if (unlikely(!fpu_is_bit_equal32(f, fpu_fcvt_i32_to_f32(ret)))) {
            fpu_raise_inexact();
        }
#endif
        return ret;
    }
    fpu_raise_invalid();
    if (fpu_is_negative32(f)) {
        return 0x80000000U;
    }
    return 0x7FFFFFFFU;
}

static forceinline int32_t fpu_fcvt_f64_to_i32(fpu_f64_t d)
{
    if (likely(fpu_f64_fits_i32(d))) {
        int32_t ret = (int32_t)fpu_raw_f64(d);
#if defined(USE_SOFT_FPU_FENV)
        if (unlikely(!fpu_is_bit_equal64(d, fpu_fcvt_i32_to_f64(ret)))) {
            fpu_raise_inexact();
        }
#endif
        return ret;
    }
    fpu_raise_invalid();
    if (fpu_is_negative64(d)) {
        return 0x80000000U;
    }
    return 0x7FFFFFFFU;
}

static forceinline uint64_t fpu_fcvt_f32_to_u64(fpu_f32_t f)
{
    if (likely(fpu_f32_fits_u64(f))) {
#if defined(USE_SOFT_FPU_FENV) || !defined(FPU_LIB_CORRECT_CONVERSION_FP_UINT)
        uint64_t ret;
        if (likely(fpu_f32_fits_i64(f))) {
            ret = (int64_t)fpu_raw_f32(f);
        } else {
            // Emulate conversion to u64 in software
            ret = (((uint64_t)fpu_bit_f32_to_u32(f)) << 40) - 0x8000000000000000ULL;
        }
        if (unlikely(!fpu_is_bit_equal32(f, fpu_fcvt_u64_to_f32(ret)))) {
            fpu_raise_inexact();
        }
        return ret;
#else
        return (uint64_t)fpu_raw_f32(f);
#endif
    }
    fpu_raise_invalid();
    if (fpu_is_negative32(f)) {
        return 0;
    }
    return -1;
}

static forceinline uint64_t fpu_fcvt_f64_to_u64(fpu_f64_t d)
{
    if (likely(fpu_f64_fits_u64(d))) {
#if defined(USE_SOFT_FPU_FENV) || !defined(FPU_LIB_CORRECT_CONVERSION_FP_UINT)
        uint64_t ret;
        if (likely(fpu_f64_fits_i64(d))) {
            ret = (int64_t)fpu_raw_f64(d);
        } else {
            // Emulate conversion to u64 in software
            ret = (fpu_bit_f64_to_u64(d) << 11) - 0x8000000000000000ULL;
        }
        if (unlikely(!fpu_is_bit_equal64(d, fpu_fcvt_u64_to_f64(ret)))) {
            fpu_raise_inexact();
        }
        return ret;
#else
        return (uint64_t)fpu_raw_f64(d);
#endif
    }
    fpu_raise_invalid();
    if (fpu_is_negative64(d)) {
        return 0;
    }
    return -1;
}

static forceinline int64_t fpu_fcvt_f32_to_i64(fpu_f32_t f)
{
    if (likely(fpu_f32_fits_i64(f))) {
        int64_t ret = (int64_t)fpu_raw_f32(f);
#if defined(USE_SOFT_FPU_FENV) || !defined(FPU_LIB_CORRECT_CONVERSION_FP_LONG)
        if (unlikely(!fpu_is_bit_equal32(f, fpu_fcvt_i64_to_f32(ret)))) {
            fpu_raise_inexact();
        }
#endif
        return ret;
    }
    fpu_raise_invalid();
    if (fpu_is_negative32(f)) {
        return 0x8000000000000000ULL;
    }
    return 0x7FFFFFFFFFFFFFFFULL;
}

static forceinline int64_t fpu_fcvt_f64_to_i64(fpu_f64_t d)
{
    if (likely(fpu_f64_fits_i64(d))) {
        int64_t ret = (int64_t)fpu_raw_f64(d);
#if defined(USE_SOFT_FPU_FENV) || !defined(FPU_LIB_CORRECT_CONVERSION_FP_LONG)
        if (unlikely(!fpu_is_bit_equal64(d, fpu_fcvt_i64_to_f64(ret)))) {
            fpu_raise_inexact();
        }
#endif
        return ret;
    }
    fpu_raise_invalid();
    if (fpu_is_negative64(d)) {
        return 0x8000000000000000ULL;
    }
    return 0x7FFFFFFFFFFFFFFFULL;
}

/*
 * Floating-point to integer rounding
 */

static forceinline uint32_t fpu_round_f32_to_u32(fpu_f32_t f, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f32_to_u32(f);
    }
    return fpu_fcvt_f32_to_u32(fpu_round_f32_internal(f, mode));
}

static forceinline uint32_t fpu_round_f64_to_u32(fpu_f64_t d, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f64_to_u32(d);
    }
    return fpu_fcvt_f64_to_u32(fpu_round_f64_internal(d, mode));
}

static forceinline int32_t fpu_round_f32_to_i32(fpu_f32_t f, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f32_to_i32(f);
    }
    return fpu_fcvt_f32_to_i32(fpu_round_f32_internal(f, mode));
}

static forceinline int32_t fpu_round_f64_to_i32(fpu_f64_t d, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f64_to_i32(d);
    }
    return fpu_fcvt_f64_to_i32(fpu_round_f64_internal(d, mode));
}

static forceinline uint64_t fpu_round_f32_to_u64(fpu_f32_t f, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f32_to_u64(f);
    }
    return fpu_fcvt_f32_to_u64(fpu_round_f32_internal(f, mode));
}

static forceinline uint64_t fpu_round_f64_to_u64(fpu_f64_t d, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f64_to_u64(d);
    }
    return fpu_fcvt_f64_to_u64(fpu_round_f64_internal(d, mode));
}

static forceinline int64_t fpu_round_f32_to_i64(fpu_f32_t f, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f32_to_i64(f);
    }
    return fpu_fcvt_f32_to_i64(fpu_round_f32_internal(f, mode));
}

static forceinline int64_t fpu_round_f64_to_i64(fpu_f64_t d, uint32_t mode)
{
    if (likely(mode == FPU_LIB_ROUND_TZ)) {
        return fpu_fcvt_f64_to_i64(d);
    }
    return fpu_fcvt_f64_to_i64(fpu_round_f64_internal(d, mode));
}

/*
 * IEEE 754 Signaling Comparisons
 *
 * - If at least one operand is a NaN, the result is false, and invalid operation exception flag is set
 * - Negative zero -0.0 is not distinguished from 0.0
 */

// Comparison: Less than (<)
static forceinline bool fpu_is_flt32_sig(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_CORRECT_SIGNALING_COMPARE)
    return fpu_raw_f32(a) < fpu_raw_f32(b);
#else
    if (fpu_raw_f32(a) < fpu_raw_f32(b)) {
        return true;
    } else if (likely(fpu_raw_f32(b) <= fpu_raw_f32(a))) {
        return false;
    }
    fpu_raise_invalid();
    return false;
#endif
}

static forceinline bool fpu_is_flt64_sig(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_CORRECT_SIGNALING_COMPARE)
    return fpu_raw_f64(a) < fpu_raw_f64(b);
#else
    if (fpu_raw_f64(a) < fpu_raw_f64(b)) {
        return true;
    } else if (likely(fpu_raw_f64(b) <= fpu_raw_f64(a))) {
        return false;
    }
    fpu_raise_invalid();
    return false;
#endif
}

// Comparison: Less than or equal (<=)
static forceinline bool fpu_is_fle32_sig(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_CORRECT_SIGNALING_COMPARE)
    return fpu_raw_f32(a) <= fpu_raw_f32(b);
#else
    if (fpu_raw_f32(a) <= fpu_raw_f32(b)) {
        return true;
    } else if (likely(fpu_raw_f32(b) < fpu_raw_f32(a))) {
        return false;
    }
    fpu_raise_invalid();
    return false;
#endif
}

static forceinline bool fpu_is_fle64_sig(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_CORRECT_SIGNALING_COMPARE)
    return fpu_raw_f64(a) <= fpu_raw_f64(b);
#else
    if (fpu_raw_f64(a) <= fpu_raw_f64(b)) {
        return true;
    } else if (likely(fpu_raw_f64(b) < fpu_raw_f64(a))) {
        return false;
    }
    fpu_raise_invalid();
    return false;
#endif
}

/*
 * IEEE 754 Quiet Comparisons
 *
 * - If at least one operand is a NaN, the result is false
 * - If at least one operand is a signaling NaN, the invalid operation exception flag is set
 * - Infinity is handled as being larger than anything finite, -Inf < any non-Nan, etc
 * - Negative zero -0.0 is not distinguished from 0.0
 */

// Comparison: Equal (a == b)
static forceinline bool fpu_is_equal32_quiet(fpu_f32_t a, fpu_f32_t b)
{
#if defined(USE_SOFT_FPU_FENV)
    if (likely(fpu_raw_f32(a) == fpu_raw_f32(b))) {
        return true;
    }
    if (fpu_is_snan32_soft(a) || fpu_is_snan32_soft(b)) {
        fpu_raise_invalid();
    }
    return false;
#else
    return fpu_raw_f32(a) == fpu_raw_f32(b);
#endif
}

static forceinline bool fpu_is_equal64_quiet(fpu_f64_t a, fpu_f64_t b)
{
#if defined(USE_SOFT_FPU_FENV)
    if (likely(fpu_raw_f64(a) == fpu_raw_f64(b))) {
        return true;
    }
    if (fpu_is_snan64_soft(a) || fpu_is_snan64_soft(b)) {
        fpu_raise_invalid();
    }
    return false;
#else
    return fpu_raw_f64(a) == fpu_raw_f64(b);
#endif
}

// Comparison: Less than (a < b)
static forceinline bool fpu_is_flt32_quiet(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_QUIET_COMPARE)
    return __builtin_isless(fpu_raw_f32(a), fpu_raw_f32(b));
#else
    if (likely(!fpu_is_nan32(a) && !fpu_is_nan32(b))) {
        return fpu_raw_f32(a) < fpu_raw_f32(b);
    }
    return false;
#endif
}

static forceinline bool fpu_is_flt64_quiet(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_QUIET_COMPARE)
    return __builtin_isless(fpu_raw_f64(a), fpu_raw_f64(b));
#else
    if (likely(!fpu_is_nan64(a) && !fpu_is_nan64(b))) {
        return fpu_raw_f64(a) < fpu_raw_f64(b);
    }
    return false;
#endif
}

// Comparison: Less than or equal (a <= b)
static forceinline bool fpu_is_fle32_quiet(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_QUIET_COMPARE)
    return __builtin_islessequal(fpu_raw_f32(a), fpu_raw_f32(b));
#else
    if (likely(!fpu_is_nan32(a) && !fpu_is_nan32(b))) {
        return fpu_raw_f32(a) <= fpu_raw_f32(b);
    }
    return false;
#endif
}

static forceinline bool fpu_is_fle64_quiet(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_QUIET_COMPARE)
    return __builtin_islessequal(fpu_raw_f64(a), fpu_raw_f64(b));
#else
    if (likely(!fpu_is_nan64(a) && !fpu_is_nan64(b))) {
        return fpu_raw_f64(a) <= fpu_raw_f64(b);
    }
    return false;
#endif
}

/*
 * IEEE 754-2008 minNum and maxNum
 *
 * - If one operand is a NaN, the result is the non-NaN operand
 * - If at least one operand is a signaling NaN, the invalid operation exception flag is set
 * - Negative zero -0.0 is considered smaller than 0.0
 */

static forceinline fpu_f32_t fpu_min32(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_IEEE754_2008_FMINMAX)
    // On RISC-V, __builtin_fminf() actually behaves the way we need
    return fpu_wrap_f32(__builtin_fminf(fpu_raw_f32(a), fpu_raw_f32(b)));
#else
    if (fpu_is_flt32_quiet(a, b)) {
        return a;
    } else if (fpu_is_flt32_quiet(b, a)) {
        return b;
    } else if (fpu_is_negative32(b)) {
        return b;
    } else if (!fpu_is_nan32_fast_soft(a)) {
        return a;
    } else if (!fpu_is_nan32_fast_soft(b)) {
        return b;
    }
    return fpu_bit_u32_to_f32(FPU_LIB_FP32_CANONICAL_NAN);
#endif
}

static forceinline fpu_f64_t fpu_min64(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_IEEE754_2008_FMINMAX)
    return fpu_wrap_f64(__builtin_fmin(fpu_raw_f64(a), fpu_raw_f64(b)));
#else
    if (fpu_is_flt64_quiet(a, b)) {
        return a;
    } else if (fpu_is_flt64_quiet(b, a)) {
        return b;
    } else if (fpu_is_negative64(b)) {
        return b;
    } else if (!fpu_is_nan64_fast_soft(a)) {
        return a;
    } else if (!fpu_is_nan64_fast_soft(b)) {
        return b;
    }
    return fpu_bit_u64_to_f64(FPU_LIB_FP64_CANONICAL_NAN);
#endif
}

static forceinline fpu_f32_t fpu_max32(fpu_f32_t a, fpu_f32_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_IEEE754_2008_FMINMAX)
    return fpu_wrap_f32(__builtin_fmaxf(fpu_raw_f32(a), fpu_raw_f32(b)));
#else
    if (fpu_is_flt32_quiet(a, b)) {
        return b;
    } else if (fpu_is_flt32_quiet(b, a)) {
        return a;
    } else if (fpu_is_positive32(b)) {
        return b;
    } else if (!fpu_is_nan32_fast_soft(a)) {
        return a;
    } else if (!fpu_is_nan32_fast_soft(b)) {
        return b;
    }
    return fpu_bit_u32_to_f32(FPU_LIB_FP32_CANONICAL_NAN);
#endif
}

static forceinline fpu_f64_t fpu_max64(fpu_f64_t a, fpu_f64_t b)
{
#if defined(FPU_LIB_CORRECT_BUILTIN_IEEE754_2008_FMINMAX)
    return fpu_wrap_f64(__builtin_fmax(fpu_raw_f64(a), fpu_raw_f64(b)));
#else
    if (fpu_is_flt64_quiet(a, b)) {
        return b;
    } else if (fpu_is_flt64_quiet(b, a)) {
        return a;
    } else if (fpu_is_positive64(b)) {
        return b;
    } else if (!fpu_is_nan64_fast_soft(a)) {
        return a;
    } else if (!fpu_is_nan64_fast_soft(b)) {
        return b;
    }
    return fpu_bit_u64_to_f64(FPU_LIB_FP64_CANONICAL_NAN);
#endif
}

#endif
